################################################################################
################################################################################
################################################################################
# THIS R SCRIPT REPLICATES ALL ANALYSES INCLUDED IN THE SUPPLEMENTAL INFORMATION 
# THAT PRODUCE DESCRIPTIVE STATISTICS OF THE WHITE HOUSE VISITOR LOGS FROM THE
# CLINTON AND OBAMA ADMINISTRATIONS. ANALYSES PRESENTED IN ORDER OF APPEARANCE.
################################################################################
################################################################################
################################################################################

library(data.table)
library(lubridate)
library(stringr)

# SET WORKING DIRECTORY TO APPROPRIATE LOCATION
# setwd("")

# LOAD EACH ADMINISTRATION'S VISITOR LOGS (THESE FILES ALREADY CONTAIN THE
# MATCHED LOBBYIST/VISITEE INFORMATION). THE FIRST ROW OF EACH CSV IS READ AS
# THE COLUMN NAMES AND TEXT COLUMNS ARE KEPT AS CHARACTER RATHER THAN FACTOR

clinton_logs <- fread(
  "clinton_logs.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
obama_logs <- fread(
  "obama_logs.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

################################################################################
# PAGE SI.10, IN-TEXT
# NUMBER OF VISITS/APPOINTMENTS IN CLINTON VISITOR LOGS

# COUNT OF DISTINCT VISIT IDENTIFIERS IN THE CLINTON LOGS
uniqueN(clinton_logs$WH_visit_id)
# COUNT OF DISTINCT APPOINTMENT IDENTIFIERS IN THE CLINTON LOGS
uniqueN(clinton_logs$WH_uniq_appt_id)

################################################################################
# PAGE SI.10, IN-TEXT
# NUMBER OF VISITS/APPOINTMENTS IN OBAMA VISITOR LOGS

# COUNT OF DISTINCT VISIT IDENTIFIERS IN THE OBAMA LOGS
uniqueN(obama_logs$WH_visit_id)
# COUNT OF DISTINCT APPOINTMENT IDENTIFIERS IN THE OBAMA LOGS
uniqueN(obama_logs$WH_uniq_appt_id)

################################################################################
# PAGE SI.11, FOOTNOTE SI.19
# COMPARING THE NUMBER OF VISITS IN OCTOBER 2009 DURING THE OBAMA ADMINISTRATION
# AND OCTOBER 2021 DURING THE BIDEN ADMINISTRATION

# READ THE FIRST YEAR OF VISITOR LOGS RELEASED BY THE BIDEN ADMINISTRATION ON
# THE WHITE HOUSE'S WEBSITE
biden_logs <- fread(
  "biden_logs_2021.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# KEEP ONLY THE ROWS WHOSE APPOINTMENT START DATE FALLS IN OCTOBER 2021.
# - THE DATE COLUMN IS PARSED ONCE, INSIDE local(), SO NO HELPER OBJECTS ARE
#   LEFT BEHIND IN THE WORKSPACE
# - truncated = 3 ALLOWS ENTRIES THAT LACK SOME OR ALL OF THE TIME COMPONENTS
#   TO STILL BE PARSED
# - THE YEAR IS CHECKED EXPLICITLY (AS IN THE OCTOBER 2009 SUBSET BELOW) SO
#   THAT OCTOBER ROWS FROM ANY OTHER YEAR CANNOT BE COUNTED
# - ROWS WHOSE DATE FAILS TO PARSE (NA) ARE DROPPED BY which()
biden_logs_oct_2021 <- local({
  appt_start <- mdy_hms(biden_logs$APPT_START_DATE, truncated = 3)
  oct_2021_rows <- which(month(appt_start) == 10 & year(appt_start) == 2021)
  biden_logs[oct_2021_rows, ]
})

# NUMBER OF WHITE HOUSE VISITS IN OCTOBER 2021 (ONE ROW PER LOG ENTRY)
nrow(biden_logs_oct_2021)

# KEEP ONLY THE ROWS WHOSE APPOINTMENT START DATE FALLS IN OCTOBER 2009.
# - THE DATE COLUMN IS PARSED ONCE (RATHER THAN ONCE FOR THE MONTH TEST AND
#   AGAIN FOR THE YEAR TEST), INSIDE local(), SO NO HELPER OBJECTS ARE LEFT
#   BEHIND IN THE WORKSPACE
# - truncated = 3 ALLOWS ENTRIES THAT LACK SOME OR ALL OF THE TIME COMPONENTS
#   TO STILL BE PARSED
# - ROWS WHOSE DATE FAILS TO PARSE (NA) ARE DROPPED BY which()
obama_logs_oct_2009 <- local({
  appt_start <- mdy_hms(obama_logs$APPT_START_DATE, truncated = 3)
  oct_2009_rows <- which(month(appt_start) == 10 & year(appt_start) == 2009)
  obama_logs[oct_2009_rows, ]
})

# NUMBER OF WHITE HOUSE VISITS IN OCTOBER 2009 (ONE ROW PER LOG ENTRY)
# NOTE(REVIEW): THIS IS A ROW COUNT; IF A SINGLE VISIT CAN OCCUPY SEVERAL ROWS
# OF obama_logs, COUNTING UNIQUE WH_visit_id MAY BE INTENDED -- TODO CONFIRM
nrow(obama_logs_oct_2009)

################################################################################
# PAGE SI.12, IN-TEXT AND FOOTNOTE SI.21
# DISCUSSION OF DESCRIPTION FIELD/TOURS IN OBAMA VISITOR LOGS

# COUNT OF OBAMA LOG ENTRIES WHOSE DESCRIPTION FIELD IS THE EMPTY STRING
# (MISSING DESCRIPTIONS ARE NOT COUNTED), FOLLOWED BY THAT COUNT AS A
# PERCENTAGE OF ALL LOG ENTRIES
n_blank_description <- sum(obama_logs$Description == "", na.rm = TRUE)
n_blank_description
round(n_blank_description / nrow(obama_logs), 3) * 100

# COUNT OF OBAMA LOG ENTRIES WHOSE ENTIRE DESCRIPTION FIELD, IGNORING CASE, IS
# "GROUP TOUR", FOLLOWED BY THAT COUNT AS A PERCENTAGE OF ALL LOG ENTRIES.
# THE PATTERN IS EVALUATED ONCE AND REUSED FOR BOTH FIGURES
is_group_tour_only <- str_detect(tolower(obama_logs$Description),
                                 "^group tour$")
sum(is_group_tour_only, na.rm = TRUE)
round(sum(is_group_tour_only, na.rm = TRUE) / nrow(obama_logs), 3) * 100

# COUNT AND PERCENTAGE OF OBAMA LOG ENTRIES WHOSE DESCRIPTION MENTIONS "TOUR"
# OR ANOTHER TERM TIED TO A TOURIST AND/OR WHITE HOUSE SERVICE-RELATED VISIT

# NOTE: THE KEYWORDS WERE CHOSEN BY THE AUTHOR AFTER READING THE 1000 MOST
# COMMON DESCRIPTION FIELD ENTRIES AND IDENTIFYING THOSE CLEARLY ASSOCIATED
# WITH TOURS; APART FROM THE WORD "TOUR" ITSELF, "CLASS VISIT" WAS THE ONLY
# OTHER COMMON ENTRY CLEARLY ASSOCIATED WITH TOURS

# THE CASE-INSENSITIVE KEYWORD MATCH IS EVALUATED ONCE AND REUSED BELOW
mentions_tour <- str_detect(tolower(obama_logs$Description),
                            "tour|class visit")
sum(mentions_tour, na.rm = TRUE)
round(sum(mentions_tour, na.rm = TRUE) / nrow(obama_logs), 3) * 100

################################################################################
# PAGE SI.21, TABLE SI.4
# NUMBER/PERCENTAGE OF UNIQUE VISITS/APPOINTMENTS WHERE THE VISITOR MATCHED 
# WITH A LOBBYIST

# THE BELOW CODE USES THE VISIT/APPOINTMENT UNIQUE IDENTIFIERS TO DETERMINE
# HOW MANY VISITS/APPOINTMENTS WERE MATCHED WITH A LOBBYIST IN EACH ADMINISTRATION

# CLINTON, NUMBER OF VISITS
# EACH COUNT IS THE NUMBER OF UNIQUE VISIT IDENTIFIERS AMONG ROWS AT OR BELOW
# THE EDIT-DISTANCE CUTOFF. EACH PERCENTAGE DIVIDES THAT COUNT BY THE NUMBER
# OF UNIQUE VISIT IDENTIFIERS IN THE FULL LOGS (NOT THE RAW ROW COUNT), SO THE
# NUMERATOR AND DENOMINATOR ARE IN THE SAME UNIT, AS IN THE APPOINTMENT
# PERCENTAGES. THE RESULT IS UNCHANGED WHEN EVERY ROW HAS ITS OWN WH_visit_id
clinton_visits_total <- uniqueN(clinton_logs$WH_visit_id)
# EXACT MATCHES
clinton_logs[visitor_edit_distance <= 0, uniqueN(WH_visit_id)]
round(clinton_logs[visitor_edit_distance <= 0, uniqueN(WH_visit_id)] /
        clinton_visits_total, 3) * 100
# LEQ 1 EDIT DISTANCE
clinton_logs[visitor_edit_distance <= 1, uniqueN(WH_visit_id)]
round(clinton_logs[visitor_edit_distance <= 1, uniqueN(WH_visit_id)] /
        clinton_visits_total, 3) * 100
# LEQ 2 EDIT DISTANCES
clinton_logs[visitor_edit_distance <= 2, uniqueN(WH_visit_id)]
round(clinton_logs[visitor_edit_distance <= 2, uniqueN(WH_visit_id)] /
        clinton_visits_total, 3) * 100
# LEQ 3 EDIT DISTANCES
clinton_logs[visitor_edit_distance <= 3, uniqueN(WH_visit_id)]
round(clinton_logs[visitor_edit_distance <= 3, uniqueN(WH_visit_id)] /
        clinton_visits_total, 3) * 100

# CLINTON, NUMBER OF APPOINTMENTS
# EACH COUNT IS THE NUMBER OF UNIQUE APPOINTMENT IDENTIFIERS AMONG ROWS AT OR
# BELOW THE EDIT-DISTANCE CUTOFF; EACH PERCENTAGE DIVIDES THAT COUNT BY THE
# NUMBER OF UNIQUE APPOINTMENT IDENTIFIERS IN THE FULL LOGS
clinton_appts_total <- uniqueN(clinton_logs$WH_uniq_appt_id)
# EXACT MATCHES
clinton_logs[visitor_edit_distance <= 0, uniqueN(WH_uniq_appt_id)]
round(clinton_logs[visitor_edit_distance <= 0, uniqueN(WH_uniq_appt_id)] /
        clinton_appts_total, 3) * 100
# LEQ 1 EDIT DISTANCE
clinton_logs[visitor_edit_distance <= 1, uniqueN(WH_uniq_appt_id)]
round(clinton_logs[visitor_edit_distance <= 1, uniqueN(WH_uniq_appt_id)] /
        clinton_appts_total, 3) * 100
# LEQ 2 EDIT DISTANCES
clinton_logs[visitor_edit_distance <= 2, uniqueN(WH_uniq_appt_id)]
round(clinton_logs[visitor_edit_distance <= 2, uniqueN(WH_uniq_appt_id)] /
        clinton_appts_total, 3) * 100
# LEQ 3 EDIT DISTANCES
clinton_logs[visitor_edit_distance <= 3, uniqueN(WH_uniq_appt_id)]
round(clinton_logs[visitor_edit_distance <= 3, uniqueN(WH_uniq_appt_id)] /
        clinton_appts_total, 3) * 100

# OBAMA, NUMBER OF VISITS
# EACH COUNT IS THE NUMBER OF UNIQUE VISIT IDENTIFIERS AMONG ROWS AT OR BELOW
# THE EDIT-DISTANCE CUTOFF. EACH PERCENTAGE DIVIDES THAT COUNT BY THE NUMBER
# OF UNIQUE VISIT IDENTIFIERS IN THE FULL LOGS (NOT THE RAW ROW COUNT), SO THE
# NUMERATOR AND DENOMINATOR ARE IN THE SAME UNIT, AS IN THE APPOINTMENT
# PERCENTAGES. THE RESULT IS UNCHANGED WHEN EVERY ROW HAS ITS OWN WH_visit_id
obama_visits_total <- uniqueN(obama_logs$WH_visit_id)
# EXACT MATCHES
obama_logs[visitor_edit_distance <= 0, uniqueN(WH_visit_id)]
round(obama_logs[visitor_edit_distance <= 0, uniqueN(WH_visit_id)] /
        obama_visits_total, 3) * 100
# LEQ 1 EDIT DISTANCE
obama_logs[visitor_edit_distance <= 1, uniqueN(WH_visit_id)]
round(obama_logs[visitor_edit_distance <= 1, uniqueN(WH_visit_id)] /
        obama_visits_total, 3) * 100
# LEQ 2 EDIT DISTANCES
obama_logs[visitor_edit_distance <= 2, uniqueN(WH_visit_id)]
round(obama_logs[visitor_edit_distance <= 2, uniqueN(WH_visit_id)] /
        obama_visits_total, 3) * 100
# LEQ 3 EDIT DISTANCES
obama_logs[visitor_edit_distance <= 3, uniqueN(WH_visit_id)]
round(obama_logs[visitor_edit_distance <= 3, uniqueN(WH_visit_id)] /
        obama_visits_total, 3) * 100

# OBAMA, NUMBER OF APPOINTMENTS
# EACH COUNT IS THE NUMBER OF UNIQUE APPOINTMENT IDENTIFIERS AMONG ROWS AT OR
# BELOW THE EDIT-DISTANCE CUTOFF; EACH PERCENTAGE DIVIDES THAT COUNT BY THE
# NUMBER OF UNIQUE APPOINTMENT IDENTIFIERS IN THE FULL LOGS
obama_appts_total <- uniqueN(obama_logs$WH_uniq_appt_id)
# EXACT MATCHES
obama_logs[visitor_edit_distance <= 0, uniqueN(WH_uniq_appt_id)]
round(obama_logs[visitor_edit_distance <= 0, uniqueN(WH_uniq_appt_id)] /
        obama_appts_total, 3) * 100
# LEQ 1 EDIT DISTANCE
obama_logs[visitor_edit_distance <= 1, uniqueN(WH_uniq_appt_id)]
round(obama_logs[visitor_edit_distance <= 1, uniqueN(WH_uniq_appt_id)] /
        obama_appts_total, 3) * 100
# LEQ 2 EDIT DISTANCES
obama_logs[visitor_edit_distance <= 2, uniqueN(WH_uniq_appt_id)]
round(obama_logs[visitor_edit_distance <= 2, uniqueN(WH_uniq_appt_id)] /
        obama_appts_total, 3) * 100
# LEQ 3 EDIT DISTANCES
obama_logs[visitor_edit_distance <= 3, uniqueN(WH_uniq_appt_id)]
round(obama_logs[visitor_edit_distance <= 3, uniqueN(WH_uniq_appt_id)] /
        obama_appts_total, 3) * 100

################################################################################
# PAGE SI.24, TABLE SI.5
# NUMBER/PERCENTAGE OF UNIQUE VISITS/APPOINTMENTS WHERE THE VISITOR MATCHED 
# WITH A LOBBYIST AND THE VISITEE IS IDENTIFIED

# THE BELOW CODE USES THE VISIT/APPOINTMENT UNIQUE IDENTIFIERS TO DETERMINE
# HOW MANY VISITS/APPOINTMENTS HAVE IDENTIFIABLE PRINCIPALS/STAFFERS AMONG THOSE
# VISITS THAT WERE MATCHED WITH A LOBBYIST IN EACH ADMINISTRATION

# CLINTON, NUMBER OF VISITS
# FOR EACH EDIT-DISTANCE CUTOFF, THE ROWS AT OR BELOW THE CUTOFF ARE SELECTED
# FIRST. THE COUNT IS THE NUMBER OF UNIQUE VISIT IDENTIFIERS AMONG THOSE ROWS
# WITH visitee_known == 1. THE PERCENTAGE DIVIDES THAT COUNT BY THE NUMBER OF
# UNIQUE VISIT IDENTIFIERS AMONG ALL SELECTED ROWS (NOT THEIR RAW ROW COUNT),
# SO THE NUMERATOR AND DENOMINATOR ARE IN THE SAME UNIT, AS IN THE APPOINTMENT
# PERCENTAGES. THE RESULT IS UNCHANGED WHEN EVERY ROW HAS ITS OWN WH_visit_id
# EXACT MATCHES
clinton_logs[visitor_edit_distance <= 0,
             uniqueN(WH_visit_id[which(visitee_known == 1)])]
clinton_logs[visitor_edit_distance <= 0,
             round(uniqueN(WH_visit_id[which(visitee_known == 1)]) /
                     uniqueN(WH_visit_id), 3) * 100]
# LEQ 1 EDIT DISTANCE
clinton_logs[visitor_edit_distance <= 1,
             uniqueN(WH_visit_id[which(visitee_known == 1)])]
clinton_logs[visitor_edit_distance <= 1,
             round(uniqueN(WH_visit_id[which(visitee_known == 1)]) /
                     uniqueN(WH_visit_id), 3) * 100]
# LEQ 2 EDIT DISTANCES
clinton_logs[visitor_edit_distance <= 2,
             uniqueN(WH_visit_id[which(visitee_known == 1)])]
clinton_logs[visitor_edit_distance <= 2,
             round(uniqueN(WH_visit_id[which(visitee_known == 1)]) /
                     uniqueN(WH_visit_id), 3) * 100]
# LEQ 3 EDIT DISTANCES
clinton_logs[visitor_edit_distance <= 3,
             uniqueN(WH_visit_id[which(visitee_known == 1)])]
clinton_logs[visitor_edit_distance <= 3,
             round(uniqueN(WH_visit_id[which(visitee_known == 1)]) /
                     uniqueN(WH_visit_id), 3) * 100]

# CLINTON, NUMBER OF APPOINTMENTS
# FOR EACH EDIT-DISTANCE CUTOFF, THE ROWS AT OR BELOW THE CUTOFF ARE SELECTED
# FIRST. THE COUNT IS THE NUMBER OF UNIQUE APPOINTMENT IDENTIFIERS AMONG THOSE
# ROWS WITH visitee_known == 1; THE PERCENTAGE DIVIDES THAT COUNT BY THE
# NUMBER OF UNIQUE APPOINTMENT IDENTIFIERS AMONG ALL SELECTED ROWS
# EXACT MATCHES
clinton_logs[visitor_edit_distance <= 0,
             uniqueN(WH_uniq_appt_id[which(visitee_known == 1)])]
clinton_logs[visitor_edit_distance <= 0,
             round(uniqueN(WH_uniq_appt_id[which(visitee_known == 1)]) /
                     uniqueN(WH_uniq_appt_id), 3) * 100]
# LEQ 1 EDIT DISTANCE
clinton_logs[visitor_edit_distance <= 1,
             uniqueN(WH_uniq_appt_id[which(visitee_known == 1)])]
clinton_logs[visitor_edit_distance <= 1,
             round(uniqueN(WH_uniq_appt_id[which(visitee_known == 1)]) /
                     uniqueN(WH_uniq_appt_id), 3) * 100]
# LEQ 2 EDIT DISTANCES
clinton_logs[visitor_edit_distance <= 2,
             uniqueN(WH_uniq_appt_id[which(visitee_known == 1)])]
clinton_logs[visitor_edit_distance <= 2,
             round(uniqueN(WH_uniq_appt_id[which(visitee_known == 1)]) /
                     uniqueN(WH_uniq_appt_id), 3) * 100]
# LEQ 3 EDIT DISTANCES
clinton_logs[visitor_edit_distance <= 3,
             uniqueN(WH_uniq_appt_id[which(visitee_known == 1)])]
clinton_logs[visitor_edit_distance <= 3,
             round(uniqueN(WH_uniq_appt_id[which(visitee_known == 1)]) /
                     uniqueN(WH_uniq_appt_id), 3) * 100]

# OBAMA, NUMBER OF VISITS
# FOR EACH EDIT-DISTANCE CUTOFF, THE ROWS AT OR BELOW THE CUTOFF ARE SELECTED
# FIRST. THE COUNT IS THE NUMBER OF UNIQUE VISIT IDENTIFIERS AMONG THOSE ROWS
# WITH visitee_known == 1. THE PERCENTAGE DIVIDES THAT COUNT BY THE NUMBER OF
# UNIQUE VISIT IDENTIFIERS AMONG ALL SELECTED ROWS (NOT THEIR RAW ROW COUNT),
# SO THE NUMERATOR AND DENOMINATOR ARE IN THE SAME UNIT, AS IN THE APPOINTMENT
# PERCENTAGES. THE RESULT IS UNCHANGED WHEN EVERY ROW HAS ITS OWN WH_visit_id
# EXACT MATCHES
obama_logs[visitor_edit_distance <= 0,
           uniqueN(WH_visit_id[which(visitee_known == 1)])]
obama_logs[visitor_edit_distance <= 0,
           round(uniqueN(WH_visit_id[which(visitee_known == 1)]) /
                   uniqueN(WH_visit_id), 3) * 100]
# LEQ 1 EDIT DISTANCE
obama_logs[visitor_edit_distance <= 1,
           uniqueN(WH_visit_id[which(visitee_known == 1)])]
obama_logs[visitor_edit_distance <= 1,
           round(uniqueN(WH_visit_id[which(visitee_known == 1)]) /
                   uniqueN(WH_visit_id), 3) * 100]
# LEQ 2 EDIT DISTANCES
obama_logs[visitor_edit_distance <= 2,
           uniqueN(WH_visit_id[which(visitee_known == 1)])]
obama_logs[visitor_edit_distance <= 2,
           round(uniqueN(WH_visit_id[which(visitee_known == 1)]) /
                   uniqueN(WH_visit_id), 3) * 100]
# LEQ 3 EDIT DISTANCES
obama_logs[visitor_edit_distance <= 3,
           uniqueN(WH_visit_id[which(visitee_known == 1)])]
obama_logs[visitor_edit_distance <= 3,
           round(uniqueN(WH_visit_id[which(visitee_known == 1)]) /
                   uniqueN(WH_visit_id), 3) * 100]

# OBAMA, NUMBER OF APPOINTMENTS
# FOR EACH EDIT-DISTANCE CUTOFF, THE ROWS AT OR BELOW THE CUTOFF ARE SELECTED
# FIRST. THE COUNT IS THE NUMBER OF UNIQUE APPOINTMENT IDENTIFIERS AMONG THOSE
# ROWS WITH visitee_known == 1; THE PERCENTAGE DIVIDES THAT COUNT BY THE
# NUMBER OF UNIQUE APPOINTMENT IDENTIFIERS AMONG ALL SELECTED ROWS
# EXACT MATCHES
obama_logs[visitor_edit_distance <= 0,
           uniqueN(WH_uniq_appt_id[which(visitee_known == 1)])]
obama_logs[visitor_edit_distance <= 0,
           round(uniqueN(WH_uniq_appt_id[which(visitee_known == 1)]) /
                   uniqueN(WH_uniq_appt_id), 3) * 100]
# LEQ 1 EDIT DISTANCE
obama_logs[visitor_edit_distance <= 1,
           uniqueN(WH_uniq_appt_id[which(visitee_known == 1)])]
obama_logs[visitor_edit_distance <= 1,
           round(uniqueN(WH_uniq_appt_id[which(visitee_known == 1)]) /
                   uniqueN(WH_uniq_appt_id), 3) * 100]
# LEQ 2 EDIT DISTANCES
obama_logs[visitor_edit_distance <= 2,
           uniqueN(WH_uniq_appt_id[which(visitee_known == 1)])]
obama_logs[visitor_edit_distance <= 2,
           round(uniqueN(WH_uniq_appt_id[which(visitee_known == 1)]) /
                   uniqueN(WH_uniq_appt_id), 3) * 100]
# LEQ 3 EDIT DISTANCES
obama_logs[visitor_edit_distance <= 3,
           uniqueN(WH_uniq_appt_id[which(visitee_known == 1)])]
obama_logs[visitor_edit_distance <= 3,
           round(uniqueN(WH_uniq_appt_id[which(visitee_known == 1)]) /
                   uniqueN(WH_uniq_appt_id), 3) * 100]
